# SETUP: Load tidyverse, which provides the stringr functions used below
library(tidyverse)

# ==============================================================================
# USECASE: Change the capitalization of a string

x <- "R4SS: Introduction to R for Social Scientists"
str_to_lower(x)    # every letter lower case
str_to_upper(x)    # every letter upper case
str_to_sentence(x) # capitalize only the first letter of the string
str_to_title(x)    # capitalize the first letter of every word

# ==============================================================================
# USECASE: Pull out part of each string by character position

x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)   # first three characters
str_sub(x, -3, -1) # last three characters (negative positions count from the end)
str_sub(x, 2, -2)  # everything except the first and last character
str_sub(x, 1, 5)   # asking for more characters than a string has is allowed

# ==============================================================================
# USECASE: Strip unwanted white space from a string

x <- " Sometimes strings have too much white space "
x
str_trim(x)   # drops white space at the start and end only
str_squish(x) # trims the ends and also collapses repeated inner white space

# ==============================================================================
# USECASE: Delete or substitute a pattern inside a string

x <- "Scientists very often utilize very fancy words, even when they could utilize simpler ones."
str_remove(x, "very ")     # deletes only the first match
str_remove_all(x, "very ") # deletes every match
str_replace(x, "utilize", "use")     # substitutes only the first match
str_replace_all(x, "utilize", "use") # substitutes every match

# NOTE: More complex patterns can be found using regular expressions (regex)

# ==============================================================================
# USECASE: Chain several string operations together in one pipeline

x_clean <- x |>
  str_remove_all(pattern = "very ") |>
  str_replace_all(pattern = "utilize", replacement = "use")
print(x_clean)
If Else
A locked door behaves conditionally
If you have the key, then open up…
Otherwise, stay closed…
Sometimes we want code to behave conditionally
filter() retains observations conditionally (i.e., an observation gets to stay only if it meets the condition)
Let’s learn to transform variables conditionally
We can use if_else() for simple examples
If Else Live Coding
# SETUP: Load tidyverse, which provides if_else(), mutate(), and read_csv()
library(tidyverse)

# ==============================================================================
# USECASE: Determining whether someone can vote in the US

age <- 12
age_group <- if_else(
  condition = age >= 18,
  true = "adult",
  false = "child"
)
print(age_group)

# ==============================================================================
# TIP: Argument names are optional, so the same call can be written more briefly

age_group <- if_else(age >= 18, "adult", "child")
print(age_group)

# ==============================================================================
# LESSON: if_else() really shines when it is given a whole vector at once

ages <- c(13, 18, 14, 19, 22, 16)
age_groups <- if_else(ages >= 18, "adult", "child")
print(age_groups)

# ==============================================================================
# USECASE: That makes it a natural fit for creating variables while wrangling

# Values recorded as "-999" in the file are read in as missing (NA)
cereal <- read_csv("cereal.csv", na = "-999")

# Without the pipe: flag cereals whose rating is above 50
cereal2 <- mutate(cereal, popular = if_else(rating > 50, "yes", "no"))
cereal2

# With the pipe: label cereals according to whether they contain any sugar
cereal3 <- cereal |>
  mutate(
    diabetes = if_else(
      condition = sugars == 0,
      true = "sugar-free",
      false = "contains sugar"
    )
  ) |>
  print()
Case When
An elevator also behaves conditionally
If you press a button, then it goes to that floor
There are usually more than just two buttons
In this analogy (but not in real life), the elevator only responds to the first button pressed
Sometimes we want code to behave this way
case_when() expands upon if_else()
It can have multiple conditions (floor buttons)
The first condition met “wins” (picks the floor)
Case When Live Coding
# SETUP: Load tidyverse, which provides case_when() and the starwars dataset
library(tidyverse)

# ==============================================================================
# USECASE: Determine what types of movies your kids can watch

ages <- c(11, 13, 18)
movies_allowed <- case_when(
  ages >= 17 ~ "R",
  ages >= 13 ~ "PG-13",
  ages < 13 ~ "PG"
) |>
  print()

# ==============================================================================
# PITFALL: Don't put the least restrictive condition first

age <- 18
movies_allowed2 <- case_when(
  age < 13 ~ "PG",
  age >= 13 ~ "PG-13",
  age >= 17 ~ "R"
) |>
  print() # age >= 13, so PG-13 wins before checking if age >= 17

# ==============================================================================
# USECASE: Use case_when to re-code variables during data wrangling

starwars

sw <- starwars |>
  mutate(
    species3 = case_when(
      species == "Human" ~ "Human",
      species == "Droid" ~ "Droid",
      species != "Human" & species != "Droid" ~ "Alien"
    )
  ) |>
  select(name, species3) |>
  print()

# ==============================================================================
# TIP: The .default argument sets the value used when no condition is met
# This is where the elevator will drop you off if you hit no buttons

sw <- starwars |>
  mutate(
    species3 = case_when(
      species == "Human" ~ "Human",
      species == "Droid" ~ "Droid",
      .default = "Alien"
    )
  ) |>
  select(name, species3) |>
  print()

# NOTE: .default requires dplyr 1.1.0 or later. On older versions, end the
# conditions with TRUE ~ "Alien" instead; it works but is harder to explain
# NOTE: The two versions differ for rows with a missing species. In the first
# version no condition is met (comparisons with NA give NA), so species3 is NA.
# In this version those rows fall through to .default and become "Alien"
Wrangle X
Pivot Longer and Wider
Both long and wide formats can be tidy
Long formats are better for multilevel modeling (MLM)
Wide formats are better for structural equation modeling (SEM)
It can be useful to quickly reshape a tibble
pivot_longer(): wide → long
pivot_wider(): long → wide
Pivot Longer Live Coding
# SETUP: Load tidyverse and read the example dataset (from workshop website)
library(tidyverse)

gradebook <- read_csv("gradebook.csv")
print(gradebook)

# ==============================================================================
# USECASE: Go from wide to long by stacking columns into name/value pairs
# The old column names land in "test" and the old cell values land in "grade"

gradebook2 <- gradebook |>
  pivot_longer(
    cols = c(test1, test2, test3, test4, test5),
    names_to = "test",
    values_to = "grade"
  )
print(gradebook2)

# ==============================================================================
# TIP: A column range saves you from typing out every column name

gradebook2 <- gradebook |>
  pivot_longer(
    cols = test1:test5,
    names_to = "test",
    values_to = "grade"
  )
print(gradebook2)

# ==============================================================================
# LESSON: names_prefix strips a shared prefix from the names being stacked

gradebook2 <- gradebook |>
  pivot_longer(
    cols = starts_with("test"),
    names_to = "test",
    values_to = "grade",
    names_prefix = "test"
  )
print(gradebook2)
Pivot Wider Live Coding
# SETUP: Load tidyverse and read the example dataset (from workshop website)
library(tidyverse)

diary <- read_csv("diary.csv") |>
  print()

# ==============================================================================
# USECASE: Reshape this long format to a wider format

# One new column per value of "scale", filled with the matching scores
diary_scale <- diary |>
  pivot_wider(
    names_from = "scale",
    values_from = "score"
  ) |>
  print()

# One new column per value of "day", filled with the matching scores
diary_day <- diary |>
  pivot_wider(
    names_from = "day",
    values_from = "score"
  ) |>
  print()

# NOTE: There are thus multiple possible wide formats (for different uses)

# ==============================================================================
# LESSON: We can add a prefix to each name to avoid numeric names

# Print the long data again before reshaping it. The dataset was loaded as
# `diary`, so that is the object to print here
diary

diary_day <- diary |>
  pivot_wider(
    names_from = "day",
    values_from = "score",
    names_prefix = "day_"
  ) |>
  print()

# ==============================================================================
# LESSON: We can also pivot on multiple columns at once

# The new column names are built from the values of both "scale" and "day"
diary_double <- diary |>
  pivot_wider(
    names_from = c("scale", "day"),
    values_from = "score"
  ) |>
  print()
Across
We can use across() to repeat an operation across multiple variables in a tibble
This makes our code shorter
It is faster to read and write
It is also less error-prone
So we can repeat a function in order to…
…mutate() multiple variables
…summarize() multiple variables
Across Live Coding
# SETUP: Load tidyverse, which provides across() and the starwars dataset
library(tidyverse)

starwars

# ==============================================================================
# USECASE: Applying the same mutation to multiple variables is a pain

sw <- starwars |>
  mutate(
    hair_color = factor(hair_color),
    skin_color = factor(skin_color),
    eye_color = factor(eye_color)
  ) |>
  print() # before

sw <- starwars |>
  mutate(
    across(
      .cols = c(hair_color, skin_color, eye_color),
      .fns = factor
    )
  ) |>
  print() # after

# ==============================================================================
# PITFALL: Don't forget to wrap the .cols part in c()

# Without c(), only mass is treated as .cols; birth_year becomes a stray
# extra argument, and the call fails
sw <- starwars |>
  mutate(
    across(
      .cols = mass, birth_year,
      .fns = \(x) round(x, digits = 1)
    )
  ) |>
  print() # error

# ==============================================================================
# LESSON: To pass arguments to the inner function, use an anonymous function
# NOTE: Adding the arguments directly inside across(), e.g. digits = 1 after
# .fns = round, is deprecated as of dplyr 1.1.0; the form below always works

sw <- starwars |>
  mutate(
    across(
      .cols = c(mass, birth_year),
      .fns = \(x) round(x, digits = 1)
    )
  ) |>
  print()

# ==============================================================================
# USECASE: You can also apply the same summary functions across variables

sw <- starwars |>
  summarize(
    height = mean(height, na.rm = TRUE),
    mass = mean(mass, na.rm = TRUE),
    birth_year = mean(birth_year, na.rm = TRUE)
  ) |>
  print()

sw <- starwars |>
  summarize(
    across(
      .cols = c(height, mass, birth_year),
      .fns = \(x) mean(x, na.rm = TRUE)
    )
  ) |>
  print()
Separate and Unite
Tidy data needs one value per cell
So we may need to separate cells
e.g., What was the model of my first car?
"Nissan Altima 2003" ⬎
"Nissan" "Altima" "2003"
But some tasks require us to unite cells
e.g., What address should I mail to?
123 "Main Street" ⬎
"123 Main Street"
Separate Live Coding
# SETUP: Load tidyverse, which provides tibble() and separate()
library(tidyverse)

# Build a small example tibble: each id packs three pieces of information
# joined by underscores, and each duration packs two pieces joined by a colon
dat <- tibble(
  id = c(
    "A_001_01", "A_002_01", "B_001_01",
    "B_002_01", "C_001_01", "C_002_01"
  ),
  duration = c("01:16", "01:21", "01:49", "00:34", "00:32", "00:54")
)
print(dat)

# ==============================================================================
# USECASE: Split one column into several columns

dat2 <- dat |>
  separate(
    col = duration,
    into = c("min", "sec"),
    sep = ":"
  )
print(dat2)

# ==============================================================================
# USECASE: The "into" vector can name more than two new columns

dat2 <- dat |>
  separate(
    col = id,
    into = c("group", "subject", "time"),
    sep = "_"
  )
print(dat2)

# ==============================================================================
# TIP: Set convert = TRUE to turn number-like strings into numbers automatically

dat2 <- dat |>
  separate(
    col = id,
    into = c("group", "subject", "time"),
    sep = "_",
    convert = TRUE
  )
print(dat2)

# ==============================================================================
# PITFALL: Don't forget to close the "into" vector's parentheses

# Here sep = ":" has ended up inside c(), so it is part of "into"
dat2 <- dat |>
  separate(
    col = duration,
    into = c("min", "sec", sep = ":")
  ) # error
Unite Live Coding
# SETUP: Load tidyverse and rebuild the separated example dataset
# NOTE: `dat` is the example tibble created in the Separate live coding script
library(tidyverse)

# The ids look like "A_001_01", so the pieces must be split on "_"
dat2 <- dat |>
  separate(col = id, into = c("group", "subject", "time"), sep = "_") |>
  separate(col = duration, into = c("min", "sec"), sep = ":") |>
  print()

# ==============================================================================
# USECASE: Unite multiple columns into one string

dat3 <- dat2 |>
  unite(col = "newid", group, subject, time, sep = "-") |>
  unite(col = "duration", min, sec, sep = ":") |>
  print()

# ==============================================================================
# LESSON: Retain the columns being united with remove = FALSE

# group:time selects every column from group through time; sep = "" joins the
# pieces with nothing in between
dat3 <- dat2 |>
  unite(col = "newid", group:time, sep = "", remove = FALSE) |>
  print()
Visualize X
Other Aesthetics
For blocky elements like bars…
color controls the outline color
fill controls the internal color
size controls the line thickness
Some mappings will induce grouping
You’ll get separate geoms per group
It can be helpful to use redundant mapping
Map one variable to multiple aesthetics
Then if one “fails” the other may work
Other Aesthetics Live Coding
# SETUP: Load tidyverse, which provides ggplot2 and the mpg dataset
library(tidyverse)

mpg

# ==============================================================================
# USECASE: Use point shape to show a categorical variable

ggplot(data = mpg, mapping = aes(x = displ, y = hwy, shape = drv)) +
  geom_point(size = 3)

# ==============================================================================
# PITFALL: Shape cannot be mapped to a continuous variable

ggplot(data = mpg, mapping = aes(x = displ, y = hwy, shape = hwy)) +
  geom_point() # error

# NOTE: This doesn't work because there are way more numbers than shapes

# ==============================================================================
# LESSON: For blocky geoms, color is the outline, fill is the inside, and size
# is the thickness of the outline

ggplot(data = mpg, mapping = aes(y = class)) +
  geom_bar()

ggplot(data = mpg, mapping = aes(y = class)) +
  geom_bar(color = "darkred", fill = "lightblue", size = 1)

# ==============================================================================
# LESSON: Mapping some aesthetics to a categorical variable also groups the data

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(method = "lm") # single smooth

ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth(method = "lm") # three smooths

# ==============================================================================
# USECASE: Map a variable to fill, then set alpha to a fixed value

ggplot(data = mpg, mapping = aes(x = hwy, fill = drv)) +
  geom_density()

ggplot(data = mpg, mapping = aes(x = hwy, fill = drv)) +
  geom_density(alpha = 0.3)

# ==============================================================================
# TIP: Mapping one variable to several aesthetics gives you a backup channel

ggplot(
  data = mpg,
  mapping = aes(x = displ, y = hwy, shape = drv, color = drv)
) +
  geom_point(size = 3) # if color fails, shape still works